/* 
Clean raw DrillingInfo/Enverus data and merge it with price and weather data to 
construct dataset used for empirical analysis in Gilbert & Roberts 2019, "Drill-bit
parity: Supply-side links in oil and gas markets". 

The file "header.dta" contains DrillingInfo/Enverus well header information contained
in the PDEN_DESC table from DI Desktop Raw Data PLUS. 

From this database we extracted the following fields: 
ENTITY_ID 
PROD_TYPE
GAS_GATH_NAME_1
FIRST_PROD_DATE
FIRST_LIQ
FIRST_GAS
BASIN
TOTAL_DEPTH

Price and weather data is merged in from the file MonthlyPricesWeather.dta, 

After cleaning, we save the file DrillBitParityAnalysisData.dta for empirical analysis.
*/ 

clear all
cd "C:\Users\bgilbert_a\Dropbox\Supply Side Oil Gas\BigWellData"
set more off

use "header.dta", clear /* Fields taken from Enverus/DrillingInfo HPDI PDEN_DESC data */
keep basin prod_type total_depth first_prod_date gas_gath_name_1 first_liq first_gas
keep if prod_type == "GAS" | prod_type =="OIL"

gen date = date(first_prod_date,"MDY")
format date %td
gen month = mofd(date)
format month %tm 

* Make rest of USA drilling data (outside Central U.S. for instrumental variables)
preserve
drop if basin == "ANADARKO BASIN" | basin == "CHAUTAUQUA PLATFORM" | basin == "EAST TEXAS BASIN" | ///
		basin == "FT WORTH BASIN" | basin == "PERMIAN BASIN" | ///
		basin == "ARKLA BASIN" | basin == "ARKOMA BASIN" | basin == "CENTRAL KANSAS UPLIFT" | ///
		basin == "CHEROKEE BASIN" | basin == "DENVER JULESBURG" | ///
		basin == "LAS ANIMAS ARCH" | basin == "NEMAHA ANTICLINE" | ///
		basin == "OUACHITA FOLDED BELT" | basin == "PALO DURO BASIN" | ///
		basin == "PICEANCE" | basin == "RATON" | basin == "SAN JUAN" | ///
		basin == "SEDGWICK BASIN" | basin == "SOUTH OKLAHOMA FOLDED BELT" | ///
		basin == "TEXAS & LOUISIANA GULF COAST BASIN" 
		
		
gen oil_well = 0
replace oil_well = 1 if prod_type == "OIL"

gen gas_well = 0
replace gas_well = 1 if prod_type == "GAS"

gen gather = 0
replace gather = 1 if gas_gath_name_1 != "(N/A)" & oil_well==1

* get well depths for output-per-well productivity IVs
gen oil_depth = total_depth*oil_well
gen gas_depth = total_depth*gas_well

* first month production
gen oil_targf = first_liq*oil_well
gen gas_targf = first_gas*gas_well
gen oil_assocf = first_liq*gas_well
gen gas_assoc_allf = first_gas*oil_well
gen gas_assoc_captf = first_gas*oil_well*gather

replace gather = . if gas_well==1

* collapse to monthly time series of all other onshore basins
collapse (sum) oil_targf gas_targf oil_assocf gas_assoc_allf gas_assoc_captf ///
			oil_well gas_well total_depth oil_depth gas_depth first_liq first_gas ///
			(mean) gather, by( month)

gen basin = "ZBASIN"

order month basin 
* save to append to the rest of the basins below
save "RestUSA.dta", replace
restore

* make monthly basin-level panel of initial production and well completions
keep if basin == "ANADARKO BASIN" | basin == "CHAUTAUQUA PLATFORM" | basin == "EAST TEXAS BASIN" | ///
		basin == "FT WORTH BASIN" | basin == "PERMIAN BASIN" | ///
		basin == "ARKLA BASIN" | basin == "ARKOMA BASIN" | basin == "CENTRAL KANSAS UPLIFT" | ///
		basin == "CHEROKEE BASIN" | basin == "DENVER JULESBURG" | ///
		basin == "LAS ANIMAS ARCH" | basin == "NEMAHA ANTICLINE" | ///
		basin == "OUACHITA FOLDED BELT" | basin == "PALO DURO BASIN" | ///
		basin == "PICEANCE" | basin == "RATON" | basin == "SAN JUAN" | ///
		basin == "SEDGWICK BASIN" | basin == "SOUTH OKLAHOMA FOLDED BELT" | ///
		basin == "TEXAS & LOUISIANA GULF COAST BASIN" 

gen oil_well = 0
replace oil_well = 1 if prod_type == "OIL"

gen gas_well = 0
replace gas_well = 1 if prod_type == "GAS"

gen gather = 0
replace gather = 1 if gas_gath_name_1 != "(N/A)" & oil_well==1

* get depths for output-per-well productivity IVs
gen oil_depth = total_depth*oil_well
gen gas_depth = total_depth*gas_well

* first month production
gen oil_targf = first_liq*oil_well
gen gas_targf = first_gas*gas_well
gen oil_assocf = first_liq*gas_well
gen gas_assoc_allf = first_gas*oil_well
gen gas_assoc_captf = first_gas*oil_well*gather

replace gather = . if gas_well==1

* collapse to monthly basin-level panel data
collapse (sum) oil_targf gas_targf oil_assocf gas_assoc_allf gas_assoc_captf ///
			oil_well gas_well total_depth oil_depth gas_depth first_liq first_gas ///
			(mean) gather, by(basin month)
append using "RestUSA.dta"
encode basin, gen(basin_no)

* work with natural logs of the initial production and well completions data
# delimit; 
foreach var of varlist 
		oil_targf gas_targf oil_assocf gas_assoc_allf gas_assoc_captf
		oil_well gas_well {;
		gen ln_`var' = ln(`var');
		recode `var' (mis=0);
		};
# delimit cr

* create total initial production variables
gen ln_gas_allf = ln(gas_targf + gas_assoc_allf)
recode ln_gas_allf (mis=0)
gen ln_gas_captf = ln(gas_targf + gas_assoc_captf)
recode ln_gas_captf (mis=0)
gen ln_oilf = ln(oil_targf + oil_assocf)
recode ln_oilf (mis=0)

gen oilf = oil_targf + oil_assocf
gen gasf = gas_targf + gas_assoc_captf

* create total productivity (total initial output per well) variables
gen tprodoilf = ln((oil_targf+oil_assocf)/(oil_well+gas_well))
recode tprodoilf (mis=0)
gen tprodgasf = ln((gas_targf+gas_assoc_captf)/(oil_well+gas_well))
recode tprodgasf (mis=0)

* create depth productivity variables (total initial output per foot of well depth)
gen gas_pftb = (gas_targf+gas_assoc_captf)/total_depth
gen oil_pftb = (oil_targf+oil_assocf)/total_depth
gen ln_gas_pftb = ln(gas_pftb)
recode ln_gas_pftb (mis=0)
gen ln_oil_pftb = ln(oil_pftb)
recode ln_oil_pftb (mis=0)

xtset basin_no month
* 3 month average output per foot of well depth
gen ln2_gas_pftb = ln((1/3)*(((gas_targf+gas_assoc_captf)/total_depth)+((L.gas_targf+L.gas_assoc_captf)/L.total_depth)+((L2.gas_targf+L2.gas_assoc_captf)/L2.total_depth)))
gen ln2_oil_pftb = ln((1/3)*(((oil_targf+oil_assocf)/total_depth)+((L.oil_targf+L.oil_assocf)/L.total_depth)+((L2.oil_targf+L2.oil_assocf)/L2.total_depth)))
* 3 month average target initial output per well (logs and levels)
gen prod2oilf = ln((1/3)*((oil_targf/oil_well)+(L.oil_targf/L.oil_well)+(L2.oil_targf/L2.oil_well))) 
gen prod2gasf = ln((1/3)*((gas_targf/gas_well)+(L.gas_targf/L.gas_well)+(L2.gas_targf/L2.gas_well)))
gen eprod2oilf = (1/3)*((oil_targf/oil_well)+(L.oil_targf/L.oil_well)+(L2.oil_targf/L2.oil_well)) 
gen eprod2gasf = (1/3)*((gas_targf/gas_well)+(L.gas_targf/L.gas_well)+(L2.gas_targf/L2.gas_well))
* 3 month average total initial output per well (logs and levels)
gen tprod2oilf = ln((1/3)*(((oil_targf+oil_assocf)/(oil_well+gas_well))+((L.oil_targf+L.oil_assocf)/(L.oil_well+L.gas_well))+((L2.oil_targf+L2.oil_assocf)/(L2.oil_well+L2.gas_well)))) 
gen tprod2gasf = ln((1/3)*(((gas_targf+gas_assoc_captf)/(oil_well+gas_well))+((L.gas_targf+L.gas_assoc_captf)/(L.oil_well+L.gas_well))+((L2.gas_targf+L2.gas_assoc_captf)/(L2.oil_well+L2.gas_well))))
gen etprod2oilf = (1/3)*(((oil_targf+oil_assocf)/(oil_well+gas_well))+((L.oil_targf+L.oil_assocf)/(L.oil_well+L.gas_well))+((L2.oil_targf+L2.oil_assocf)/(L2.oil_well+L2.gas_well)))
gen etprod2gasf = (1/3)*(((gas_targf+gas_assoc_captf)/(oil_well+gas_well))+((L.gas_targf+L.gas_assoc_captf)/(L.oil_well+L.gas_well))+((L2.gas_targf+L2.gas_assoc_captf)/(L2.oil_well+L2.gas_well)))
* 6 month average target initial output per well (logs and levels)
gen prod3oilf = ln((1/6)*((oil_targf/oil_well)+(L.oil_targf/L.oil_well)+(L2.oil_targf/L2.oil_well)+(L3.oil_targf/L3.oil_well)+(L4.oil_targf/L4.oil_well)+(L5.oil_targf/L5.oil_well))) 
gen prod3gasf = ln((1/6)*((gas_targf/gas_well)+(L.gas_targf/L.gas_well)+(L2.gas_targf/L2.gas_well)+(L3.gas_targf/L3.gas_well)+(L4.gas_targf/L4.gas_well)+(L5.gas_targf/L5.gas_well)))
gen eprod3oilf = (1/6)*((oil_targf/oil_well)+(L.oil_targf/L.oil_well)+(L2.oil_targf/L2.oil_well)+(L3.oil_targf/L3.oil_well)+(L4.oil_targf/L4.oil_well)+(L5.oil_targf/L5.oil_well)) 
gen eprod3gasf = (1/6)*((gas_targf/gas_well)+(L.gas_targf/L.gas_well)+(L2.gas_targf/L2.gas_well)+(L3.gas_targf/L3.gas_well)+(L4.gas_targf/L4.gas_well)+(L5.gas_targf/L5.gas_well))
* 6 month average total initial output per well (logs and levels)
gen tprod3oilf = ln((1/6)*(((oil_targf+oil_assocf)/(oil_well+gas_well))+((L.oil_targf+L.oil_assocf)/(L.oil_well+L.gas_well))+((L2.oil_targf+L2.oil_assocf)/(L2.oil_well+L2.gas_well))+((L3.oil_targf+L3.oil_assocf)/(L3.oil_well+L3.gas_well))+((L4.oil_targf+L4.oil_assocf)/(L4.oil_well+L4.gas_well))+((L5.oil_targf+L5.oil_assocf)/(L5.oil_well+L5.gas_well)))) 
gen tprod3gasf = ln((1/6)*(((gas_targf+gas_assoc_captf)/(oil_well+gas_well))+((L.gas_targf+L.gas_assoc_captf)/(L.oil_well+L.gas_well))+((L2.gas_targf+L2.gas_assoc_captf)/(L2.oil_well+L2.gas_well))+((L3.gas_targf+L3.gas_assoc_captf)/(L3.oil_well+L3.gas_well))+((L4.gas_targf+L4.gas_assoc_captf)/(L4.oil_well+L4.gas_well))+((L5.gas_targf+L5.gas_assoc_captf)/(L5.oil_well+L5.gas_well))))
gen etprod3oilf = (1/6)*(((oil_targf+oil_assocf)/(oil_well+gas_well))+((L.oil_targf+L.oil_assocf)/(L.oil_well+L.gas_well))+((L2.oil_targf+L2.oil_assocf)/(L2.oil_well+L2.gas_well))+((L3.oil_targf+L3.oil_assocf)/(L3.oil_well+L3.gas_well))+((L4.oil_targf+L4.oil_assocf)/(L4.oil_well+L4.gas_well))+((L5.oil_targf+L5.oil_assocf)/(L5.oil_well+L5.gas_well))) 
gen etprod3gasf = (1/6)*(((gas_targf+gas_assoc_captf)/(oil_well+gas_well))+((L.gas_targf+L.gas_assoc_captf)/(L.oil_well+L.gas_well))+((L2.gas_targf+L2.gas_assoc_captf)/(L2.oil_well+L2.gas_well))+((L3.gas_targf+L3.gas_assoc_captf)/(L3.oil_well+L3.gas_well))+((L4.gas_targf+L4.gas_assoc_captf)/(L4.oil_well+L4.gas_well))+((L5.gas_targf+L5.gas_assoc_captf)/(L5.oil_well+L5.gas_well)))
* log total new well completions
gen drill = ln(oil_well+gas_well)

order month basin_no

* reshape to individual basin time series instead of panel
reshape wide basin oil_targ-drill, i(month) j(basin_no)
tsset month

* cumulative past drilling and cumulative past initial production per basin
# delimit;
foreach num of numlist 1/21 {;
		gen totgaswell`num' = ln(sum(gas_well`num'));
		gen totoilwell`num' = ln(sum(oil_well`num'));
		gen totwell`num' = ln(sum(gas_well`num'+oil_well`num'));
		};
# delimit;
foreach num of numlist 1/21 {;
		gen totgas`num' = ln(sum(gasf`num'));
		gen totoil`num' = ln(sum(oilf`num'));
		gen tot`num' = ln(sum(gasf`num'+5.8*oilf`num'));
		};
# delimit cr	
		
* IV total wells in other non-adjacent basins
* Anadarko
order month basin* oil_well* gas_well*
egen and_gas = rowtotal(gas_well2 gas_well7 gas_well8 gas_well12 gas_well15 gas_well17 gas_well20 gas_well21)
replace and_gas = ln(and_gas)
egen and_oil = rowtotal(oil_well2 oil_well7 oil_well8 oil_well12 oil_well15 oil_well17 oil_well20 oil_well21)
replace and_oil = ln(and_oil)
* Chautauqua 
egen chq_oil = rowtotal(oil_well2 oil_well7-oil_well17 oil_well20 oil_well21)
egen chq_gas = rowtotal(gas_well2 gas_well7-gas_well17 gas_well20 gas_well21)
replace chq_oil = ln(chq_oil)
replace chq_gas = ln(chq_gas)
* East Texas
egen etx_oil = rowtotal(oil_well1 oil_well4-oil_well7 oil_well10 oil_well11 oil_well13-oil_well19 oil_well21)
egen etx_gas = rowtotal(gas_well1 gas_well4-gas_well7 gas_well10 gas_well11 gas_well13-gas_well19 gas_well21)
replace etx_oil = ln(etx_oil)
replace etx_gas = ln(etx_gas)
* Fort Worth 
egen fwth_oil = rowtotal(oil_well2-oil_well7 oil_well10 oil_well11 oil_well15-oil_well18 oil_well21 )
egen fwth_gas = rowtotal(gas_well2-gas_well7 gas_well10 gas_well11 gas_well15-gas_well18 gas_well21)
replace fwth_oil = ln(fwth_oil)
replace fwth_gas = ln(fwth_gas)
* Permian
egen prm_oil = rowtotal(oil_well1-oil_well8 oil_well10-oil_well12 oil_well15-oil_well19 oil_well21)
egen prm_gas = rowtotal(gas_well1-gas_well8 gas_well10-gas_well12 gas_well15-gas_well19 gas_well21)
replace prm_oil = ln(prm_oil)
replace prm_gas = ln(prm_gas)


* For regressions on the sum of all 5 study basins:
* Dependent variable for all 5 basins - sum of target initial production in all 5
order oil_targf*
egen oil_targfot = rowtotal(oil_targf1 oil_targf5 oil_targf8 oil_targf9 oil_targf14)
replace oil_targfot = ln(oil_targfot)
egen oil_targftot = rowtotal(oil_targf*)
replace oil_targftot = ln(oil_targftot)

order gas_targf*
egen gas_targfot = rowtotal(gas_targf1 gas_targf5 gas_targf8 gas_targf9 gas_targf14)
replace gas_targfot = ln(gas_targfot)
egen gas_targftot = rowtotal(gas_targf*)
replace gas_targftot = ln(gas_targftot)

* Target productivity control variable aggregated over all 5 basins
order eprod3oilf*
gen prod3oilfot = ln(eprod3oilf1 + eprod3oilf5 + eprod3oilf8 + eprod3oilf9 + eprod3oilf14)
egen prod3oilftot = rowtotal(eprod3oilf*)
replace prod3oilftot = ln(prod3oilftot)
order eprod3gasf*
gen prod3gasfot = ln(eprod3gasf1 + eprod3gasf5 + eprod3gasf8 + eprod3gasf9 + eprod3gasf14)
egen prod3gasftot = rowtotal(eprod3gasf*)
replace prod3gasftot = ln(prod3gasftot)

* Dependent variable for all 5 basins - sum of total initial production in all 5
order oilf*
egen oilfot = rowtotal(oilf1 oilf5 oilf8 oilf9 oilf14)
replace oilfot = ln(oilfot)
egen oilftot = rowtotal(oilf*)
replace oilftot = ln(oilftot)

order gasf*
egen gasfot = rowtotal(gasf1 gasf5 gasf8 gasf9 gasf14)
replace gasfot = ln(gasfot)
egen gasftot = rowtotal(gasf*)
replace gasftot = ln(gasftot)

* Total productivity control aggregated over all 5 basins
order etprod3oilf*
gen tprod3oilfot = ln(etprod3oilf1 + etprod3oilf5 + etprod3oilf8 + etprod3oilf9 + etprod3oilf14)
egen tprod3oilftot = rowtotal(etprod3oilf*)
replace tprod3oilftot = ln(tprod3oilftot)
order etprod3gasf*
gen tprod3gasfot = ln(etprod3gasf1 + etprod3gasf5 + etprod3gasf8 + etprod3gasf9 + etprod3gasf14)
egen tprod3gasftot = rowtotal(etprod3gasf*)
replace tprod3gasftot = ln(tprod3gasftot)

* Output per well productivity variables from non-adjacent basins (as IVs) 
order eprod3oilf* eprod3gasf*
* Anadarko
egen prod3oilfNan = rowtotal(eprod3oilf2   eprod3oilf7   eprod3oilf8   eprod3oilf12   eprod3oilf15   eprod3oilf17   eprod3oilf20)
egen prod3gasfNan = rowtotal(eprod3gasf2   eprod3gasf7   eprod3gasf8   eprod3gasf12   eprod3gasf15  eprod3gasf17   eprod3gasf20)
replace prod3oilfNan = ln(prod3oilfNan )
replace prod3gasfNan = ln(prod3gasfNan)
* Chautauqua
egen prod3oilfNch = rowtotal(eprod3oilf2 eprod3oilf7-eprod3oilf17 eprod3oilf20 )
egen prod3gasfNch = rowtotal(eprod3gasf2 eprod3gasf7-eprod3gasf17 eprod3gasf20 )
replace prod3oilfNch = ln(prod3oilfNch)
replace prod3gasfNch = ln(prod3gasfNch)
* East Texas
egen prod3oilfNet = rowtotal(eprod3oilf1 eprod3oilf4-eprod3oilf7 eprod3oilf10 eprod3oilf11 eprod3oilf13-eprod3oilf19 )
egen prod3gasfNet = rowtotal(eprod3gasf1 eprod3gasf4-eprod3gasf7 eprod3gasf10 eprod3gasf11 eprod3gasf13-eprod3gasf19 )
replace prod3oilfNet = ln(prod3oilfNet)
replace prod3gasfNet = ln(prod3gasfNet)
* Fort Worth
egen prod3oilfNfw = rowtotal(eprod3oilf2-eprod3oilf7 eprod3oilf10 eprod3oilf11 eprod3oilf15-eprod3oilf18  )
egen prod3gasfNfw = rowtotal(eprod3gasf2-eprod3gasf7 eprod3gasf10 eprod3gasf11 eprod3gasf15-eprod3gasf18 )
replace prod3oilfNfw = ln(prod3oilfNfw)
replace prod3gasfNfw = ln(prod3gasfNfw)
* Permian
egen prod3oilfNpr = rowtotal(eprod3oilf1-eprod3oilf8 eprod3oilf10-eprod3oilf12 eprod3oilf15-eprod3oilf19 )
egen prod3gasfNpr = rowtotal(eprod3gasf1-eprod3gasf8 eprod3gasf10-eprod3gasf12 eprod3gasf15-eprod3gasf19 )
replace prod3oilfNpr = ln(prod3oilfNpr)
replace prod3gasfNpr = ln(prod3gasfNpr)

/* 
Drilling in non-adjacent basins for IVs: 
Anadarko 1. Neighbors: southern ok fold (19), palo duro (13), chautauqua (5), sedgwick (18), 
							ouachita (12), las animas (10), denver (7), raton (16), san juan (17),
							ft worth (9), arkoma (3), central kansas (4)
Chautauqua 5. Neighbors: anadarko (1), southern ok fold (19), arkoma (3), ouachita (12), arkla (2),
							sedgwick (18), nemaha (11), ft worth (9), cherokee (6)
East Texas 8. Neighbors: tx&la coast (20), ouachita (12), ft worth (9), arkla (2), arkoma (3)

Ft Worth 9. Neighbors: ouachita (12), east texas (8), southern ok fold (19), palo duro (13), 
							permian (14), anadarko (1)
Permian 14. Neighbors: palo duro (13), san juan (17), raton (16), ft worth (9), tx&la coast (20)
*/
order basin* oil_well* gas_well*
* Anadarko
* non-neighbors: arkla, denver, east texas, ouachita, piceance, san juan, tx & la coast, 
egen an2_oil = rowtotal(oil_well2 oil_well7 oil_well8 oil_well12 oil_well15 oil_well17 oil_well20)
egen an2_gas = rowtotal(gas_well2 gas_well7 gas_well8 gas_well12 gas_well15 gas_well17 gas_well20)
replace an2_oil = ln(an2_oil)
replace an2_gas = ln(an2_gas)
* neighbors: arkoma, central kansas, chautauqua, cherokee, fort worth, las animas, nemaha, 
* palo duro, raton, sedgwick, southern ok fold

* Chautauqua Platform
* non-neighbors: arkla, denver, etc. (everything not on the neighbor list)
egen ch2_oil = rowtotal(oil_well2 oil_well7-oil_well17 oil_well20)
egen ch2_gas = rowtotal(gas_well2 gas_well7-gas_well17 gas_well20)
replace ch2_oil = ln(ch2_oil)
replace ch2_gas = ln(ch2_gas)
* neighbors: anadarko, arkoma, central kansas, cherokee, sedgwick, southern ok fold, 

* East Texas
* non-neighbors: everything not on the neighbor list
egen et2_oil = rowtotal(oil_well1 oil_well4-oil_well7 oil_well10 oil_well11 oil_well13-oil_well19)
egen et2_gas = rowtotal(gas_well1 gas_well4-gas_well7 gas_well10 gas_well11 gas_well13-gas_well19)
replace et2_oil = ln(et2_oil)
replace et2_gas = ln(et2_gas)
* neighbors: arkla, arkoma, fort worth, ouachita, tx & la gulf

* Fort Worth
* non-neighbors: arkla, arkoma, central kansas, chautauqua, cherokee, denver, 
* las animas, nemaha, piceance, raton, san juan, sedgwick, 
egen fw2_oil = rowtotal(oil_well2-oil_well7 oil_well10 oil_well11 oil_well15-oil_well18 )
egen fw2_gas = rowtotal(gas_well2-gas_well7 gas_well10 gas_well11 gas_well15-gas_well18)
replace fw2_oil = ln(fw2_oil)
replace fw2_gas = ln(fw2_gas)
* neighbors: east texas, ouachita, palo duro, permian, southern ok fold, tx & la coast

* Permian
* non-neighbors:
egen pr2_oil = rowtotal(oil_well1-oil_well8 oil_well10-oil_well12 oil_well15-oil_well19)
egen pr2_gas = rowtotal(gas_well1-gas_well8 gas_well10-gas_well12 gas_well15-gas_well19)
replace pr2_oil = ln(pr2_oil)
replace pr2_gas = ln(pr2_gas)
* neighbors: fort worth, palo duro, tx & la gulf

sort month
drop if month > tm(2016m8)

merge 1:1 month using "MonthlyPricesWeather.dta", nogen
save "DrillBitParityAnalysisData.dta", replace
